R scripts to accompany SC20 paper “GPU Lifetimes on Titan Supercomputer: Survival Analysis and Reliability” by George Ostrouchov, Don Maxwell, Rizwan Ashraf, Christian Engelmann, Mallikarjun Shankar, and James Rogers.

options(repos = c(CRAN = "https://cran.rstudio.com")) # set CRAN mirror (https)
## Compare against installed package *names*: installed.packages() returns a
## matrix, and `%in%` against the whole matrix scans every cell, which could
## false-positive on a matching string in another column.
installed = rownames(installed.packages())
if(! "remotes" %in% installed) install.packages("remotes")
## `&&` short-circuits, so packageVersion() is only called when installed.
if(! ("forestmodel" %in% installed && packageVersion("forestmodel") >= "0.6.1"))
  remotes::install_github("NikNakk/forestmodel") # only package from GitHub
## all other packages will prompt to install from CRAN
## Attach analysis packages. data.table is attached first; because dplyr and
## lubridate are attached later, their versions of the masked functions
## listed in the attach messages below take precedence in this session.
library(data.table)
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(survival)
library(survminer)
## Loading required package: ggpubr
library(forestmodel)
## Record R version, platform, and package versions for reproducibility.
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forestmodel_0.6.2   survminer_0.4.8.999 ggpubr_0.4.0       
##  [4] survival_3.1-12     lubridate_1.7.9     forcats_0.5.0      
##  [7] ggplot2_3.3.2       dplyr_1.0.2         magrittr_1.5       
## [10] data.table_1.13.0  
## 
## loaded via a namespace (and not attached):
##  [1] zoo_1.8-8        tidyselect_1.1.0 xfun_0.16        purrr_0.3.4     
##  [5] splines_4.0.2    haven_2.3.1      lattice_0.20-41  carData_3.0-4   
##  [9] colorspace_1.4-1 vctrs_0.3.2      generics_0.0.2   htmltools_0.5.0 
## [13] yaml_2.2.1       survMisc_0.5.5   rlang_0.4.7      pillar_1.4.6    
## [17] foreign_0.8-80   glue_1.4.1       withr_2.2.0      readxl_1.3.1    
## [21] lifecycle_0.2.0  stringr_1.4.0    munsell_0.5.0    ggsignif_0.6.0  
## [25] gtable_0.3.0     cellranger_1.1.0 zip_2.1.0        evaluate_0.14   
## [29] knitr_1.29       rio_0.5.16       curl_4.3         broom_0.7.0     
## [33] Rcpp_1.0.5       xtable_1.8-4     scales_1.1.1     backports_1.1.8 
## [37] abind_1.4-5      km.ci_0.5-2      gridExtra_2.3    hms_0.5.3       
## [41] digest_0.6.25    stringi_1.4.6    openxlsx_4.1.5   rstatix_0.6.0   
## [45] KMsurv_0.1-5     grid_4.0.2       tools_4.0.2      tibble_3.0.3    
## [49] crayon_1.3.4     tidyr_1.1.1      car_3.0-9        pkgconfig_2.0.3 
## [53] ellipsis_0.3.1   Matrix_1.2-18    rmarkdown_2.3    R6_2.4.1        
## [57] compiler_4.0.2

The above documents the software versions and the platform that were used to run the analysis.

## Directory layout, relative to the script's working directory.
out_figs <- "../figs/"  # destination for paper-related figures
data_dir <- "../data/"  # location of the input data files
service_nodes <- paste0(data_dir, "titan.service.txt")
history_file <- paste0(data_dir, "titan.gpu.history.txt")

source("TitanGPUsetup.R")  # helper functions used throughout the analysis
Rmd.time <- proc.time()    # start the wall-clock timer for the whole run

Read the data.

The last inventory-file processing was done on 01/20/2020, but Titan was turned off on 08/01/2019, so these remove dates are changed to the lights-out date.

## Read the raw GPU swap history: one row per insertion/removal or event.
g_raw = fread(history_file, col.names = c("SN", "location", "insert", "remove"))
## Records still present at the final (01/20/2020) inventory pass were only
## "removed" by that pass; rewrite them to Titan's lights-out date/time.
lastrun_dates = stringr::str_starts(g_raw$remove, "01/20/2020")
g_raw$remove[lastrun_dates] = "08/01/2019 20:07:33"
## "\n" added: without it the next cat() output is glued onto this line.
cat("Total of", sum(lastrun_dates), "remove dates changed to lights-out date\n")
## Total of 18642 remove dates changed to lights-out date
cat("Total of", (unique_SN = length(unique(g_raw$SN))), "unique SN\n")
## Total of 30794 unique SN
cat("Total of", (unique_loc = length(unique(g_raw$location))),
    "unique locations\n")
## Total of 19185 unique locations

Enable DST corrected dates for insert and remove with Eastern time zone. Enable missing values, and create some new variables. Variable event is “life” when both insert and remove are present, “life0” when insert = remove, otherwise it is whatever string is in insert, which is “DBE” or “Off The BUS” (re-coded to “OTB”). Fill in (repeat) serial numbers and locations for all records.

## Time zone for all timestamp parsing (Titan ran at ORNL, Eastern time),
## so DST transitions are handled correctly.
time_zone = "America/New_York"
## Classify each raw row and parse its dates:
## - a row whose `insert` field parses as a date is a life span ("life");
##   otherwise `insert` holds an event code ("DBE" or "Off The BUS" -> "OTB")
## - "life" rows with zero duration are re-coded "life0"
## - SN and location are blank on continuation rows in the raw file, so they
##   are filled forward with zoo::na.locf after blank -> NA conversion
g_ev = g_raw %>% 
  mutate(event = if_else(is.na(lubridate::mdy_hms(insert, quiet = TRUE)),
                         insert,  # if not a date, event is the insert string
                         "life"), # if a date, it's a life span
         event = if_else(event == "Off The BUS", "OTB", event), # recode "Off The BUS" to "OTB"
         location = na_if(location, ""),  # blank to NA for zoo later
         insert = lubridate::mdy_hms(insert, tz = time_zone, quiet = TRUE),
         remove = lubridate::mdy_hms(remove, tz = time_zone), # not quiet: warn in case an NA is produced
         duration = lubridate::as.duration(remove - insert), # in seconds
         event = as.factor(if_else(event == "life" & as.numeric(duration) == 0,
                                   "life0", event)),
         SN = ifelse(SN == "", as.character(NA), SN), # blank to NA for zoo
         SN = zoo::na.locf(SN), # fill in missing SN (forward)
         SN_ok = grepl("[0-9]{13}", SN), # verify that SN is 13 numbers
         # NOTE(review): this pattern and location_ok below are unanchored
         # (no "^...$"), so any string *containing* a valid substring also
         # passes; the bad-record listings suggest this is benign here, but
         # confirm before reusing on other data.
         location = zoo::na.locf(location), # fill in missing location forward
         location_ok = grepl("c[0-9][0-9]?-[0-7]c[0-2]s[0-7]n[0-3]", location))

List bad SN and bad location raw data. Remove bad SN and bad location data. Cause is unknown but there are very few.

## Cutoff dates delimiting the period of consistent record keeping, parsed
## in the Eastern time zone used throughout.
good_date_remove = lubridate::mdy_hms("01/01/2014 00:00:00", tz = time_zone)
## Separator normalized from "-" to " " for consistency with the other two
## strings; lubridate's mdy_hms parses either separator to the same instant.
good_date_insert = lubridate::mdy_hms("11/01/2013 00:00:00", tz = time_zone)
good_date_last = lubridate::mdy_hms("12/01/2016 00:00:00", tz = time_zone)

## List the raw rows flagged invalid above. Logical indexing into g_raw with
## flags computed on g_ev works because g_ev preserves g_raw's row order.
g_raw[!g_ev$SN_ok, ]  # list bad SN records
##          SN    location              insert              remove
## 1: Bad_Read  c6-7c0s2n1 12/16/2012 20:07:03 12/16/2012 20:07:03
## 2:           c3-1c2s6n2 12/29/2012 01:52:15 01/22/2013 16:56:02
## 3:          c18-1c2s1n0 03/25/2013 15:28:34 03/28/2013 22:17:56
## 4:           c5-7c0s5n1 10/10/2012 17:27:01 10/10/2012 17:27:01
## 5:          c16-1c2s0n2 11/16/2012 19:46:09 11/28/2012 14:05:41
## 6:           c1-0c2s2n1 09/15/2015 10:29:47 09/15/2015 20:33:20
g_raw[!g_ev$location_ok, ] # list bad location records
## Empty data.table (0 rows and 4 cols): SN,location,insert,remove

Read service nodes and remove associated data.

## Read the service-node list; records at service-node locations are dropped
## from the analysis below.
service = fread(service_nodes, col.names = c("node", "loc", "location", "type", "up", "run"))
cat("Total of", (unique_sloc = length(unique(service$location))),
    "unique service locations\n")
## Total of 512 unique service locations
## Sanity check: service locations that never appear in the GPU history.
service_notseen =
  service$location[!(service$location %in% unique(g_ev$location))]
cat("Service locations not referenced in data:", service_notseen, "\n")
## Service locations not referenced in data: c24-0c0s0n0 c24-0c0s0n1 c24-0c0s2n0 c24-0c0s2n1 c24-0c0s4n0 c24-0c0s4n1 c24-0c0s6n0 c24-0c0s6n1 c24-0c0s6n2 c24-0c0s6n3 c24-0c0s4n2 c24-0c0s4n3 c24-0c0s2n2 c24-0c0s2n3 c24-0c0s0n2 c24-0c0s0n3
## Keep only valid SN/location rows outside service locations. Filtering
## must happen after the na.locf fill above, or forward-fill would propagate
## values across the removed rows.
gc_ev = g_ev %>%
  filter(SN_ok, location_ok, !(location %in% service$location)) %>%
  select(-SN_ok, -location_ok) # remove service node data (must be after zoo!)
cat("Total of", (unique_SN = length(unique(gc_ev$SN))), "unique clean SN\n")
## Total of 30786 unique clean SN
cat("Total of", (unique_loc = length(unique(gc_ev$location))),
    "unique clean locations\n")
## Total of 18688 unique clean locations

Make a chronology plot of Titan GPU life. Overlay yearly histogram over daily histogram. Yearly excludes data before good_date_remove. Both plots are restricted to “life” records only. (Note to reviewers: an earlier manuscript version included “life0” “DBE” “OTB” events in this chronology, artificially inflating GPU swap numbers, mostly in the first year.)

## Chronology of GPU removals: daily histogram (blue) of "life" records,
## overlaid on a faint yearly histogram starting at good_date_remove.
## Note the nesting: in the outer filter, max(remove) is evaluated over the
## life-only subset; collapsing the two filters into one would change which
## maximum is used. Both layers drop the final max(remove) spike (the
## 2019-08-01 lights-out timestamp assigned to all surviving units above).
ggp(ggplot(filter(g_ev %>% filter(event == "life"), remove < max(remove)), aes(remove)) + 
      geom_histogram(
        data = filter(g_ev, remove >= good_date_remove, remove < max(remove)),
        mapping = aes(remove), binwidth = 60*60*24*365, alpha = 0.2,
        boundary = good_date_remove) +
      geom_histogram(binwidth = 60*60*24, color = "blue") + theme_bw() +
      theme(axis.title.x = element_blank()) + scale_x_datetime(breaks = "1 year"), #+
#      scale_y_continuous(limits = c(0, 9250), breaks = seq(0, 9000, 1000)),
    file = "chronology", width = 10, height = 3)

Begin data selection by removing all GPU life times that have a remove date before `good_date_remove` and all that have zero life times. The premise is that record keeping was different and possibly inconsistent before `good_date_remove`. `gc_ev_nzg` is used in many analyses going forward. Five original SNs remain after this and are removed manually to have a clean "post 2nd rework cycle" data set.

## Restrict to the consistent-record-keeping era: keep removals after
## good_date_remove, drop zero-duration lives ("life0"), and drop five
## specific SNs whose insert dates predate 2013-06 (see text above).
gc_ev_nzg = gc_ev %>%
  filter(remove > good_date_remove, # drop GPUs removed before good_date!
         !(SN %in% c("0323712022786", "0323712007923", "0323712007994",
                   "0323712022956", "0323712042970")), # rm insert dates < 2013-06
         event != "life0") %>% mutate(event = fct_drop(event))  # remove life0's
last_inventory = max(gc_ev_nzg$remove, na.rm = TRUE) # last right-censoring date
cat("gc_ev dim", dim(gc_ev), "processed into gc_ev_nzg dim", dim(gc_ev_nzg), "\n")
## gc_ev dim 110205 6 processed into gc_ev_nzg dim 42878 6
## Accounting for the dropped records (computed on gc_ev, pre-filter).
cat("before good_date", sum(gc_ev$remove <= good_date_remove, na.rm = TRUE), ": life0",
    sum(gc_ev$event == "life0", na.rm = TRUE), "remove is NA", sum(is.na(gc_ev$remove)), "\n")
## before good_date 67113 : life0 9010 remove is NA 0

Mark time overlaps within SN, then remove the overlap record and plot before-after. SN time overlaps are possible when an inventory is incomplete. A unit is not detected as removed, yet is detected as inserted in a different location - thus the SN appears at both locations until the next inventory.

## Split the clean events into life spans and failures, then flag time
## overlaps within SN (mark.overlaps and plot.life_ev are defined in
## TitanGPUsetup.R, sourced above).
gc_life = mark.overlaps(gc_ev_nzg %>% filter(event == "life") %>%
                          select(-event), SN)
## Failure events (DBE/OTB) only, ordered for later joining on
## (SN, remove, location).
gc_ev_fail = gc_ev_nzg %>% select(SN, remove, location, event) %>%
  filter(event != "life") %>% arrange(SN, remove, location) %>%
  mutate(event = fct_drop(event))
plot.life_ev(gc_life, gc_ev_nzg, SN, overlaps = TRUE, outs = FALSE, file = "overlap_SN")
## There are 165 units with life overlap:
## [1] "Overlap life time 2157174961s (~68.36 years) of total life time 3184438765556s (~100908.78 years) ( 0.000677411349319308 proportion )"

Now the same SN after removing overlap records:

## Drop the records flagged as overlapping within SN and re-plot to confirm
## the remaining overlap time is zero.
gc_life = gc_life %>% filter(!overlap_rec)
plot.life_ev(gc_life, gc_ev_nzg, SN, overlaps = TRUE, outs = FALSE, file = "overlap_SN_after") 
## There are 165 units with life overlap:
## [1] "Overlap life time 0s of total life time 3182281590595s (~100840.42 years) ( 0 proportion )"

For locations, remove only the overlapping life but keep the rest of the SN. The overlaps seem very short so this makes sense. Why do location overlaps exist? In some cases, they seem to be the full blade of 4 GPUs. Blades were sometimes hot-swapped by CRAY and if this was during an inventory run, the results could be unpredictable.

## Re-run overlap marking, this time grouping by location instead of SN.
gc_life = mark.overlaps(gc_life, location)
plot.life_ev(gc_life, gc_ev_nzg, location, overlaps = TRUE, outs = FALSE, 
             file = "overlap_Loc")
## There are 160 units with life overlap:
## [1] "Overlap life time 631679420s (~20.02 years) of total life time 3182281590595s (~100840.42 years) ( 0.000198498907785811 proportion )"

Print raw data for a few overlap locations to understand reasons:

## Print +/-5 rows of raw context around each occurrence of this location.
## The window is clamped to valid row numbers: an unclamped (i-5):(i+5)
## would mix non-positive and positive indices for matches in the first
## five rows and error in data.table (and overrun the table near the end).
for(i in which(g_raw$location == "c7-7c0s7n3"))
  print(g_raw[max(1, i - 5):min(nrow(g_raw), i + 5), ])
##                SN    location              insert              remove
##  1:                c0-4c0s3n1 11/01/2013 21:52:42 11/04/2013 17:22:58
##  2:               c16-1c2s4n2 09/16/2012 18:00:54 02/23/2013 17:11:13
##  3:               c17-4c0s7n1 02/10/2014 11:00:31 03/25/2018 17:19:12
##  4:                                           DBE 03/25/2018 17:19:12
##  5:               c18-3c0s7n2 03/25/2013 15:28:34 05/29/2013 11:54:11
##  6: 0323812068631  c7-7c0s7n3 09/30/2012 12:20:00 01/25/2013 15:29:58
##  7:                c3-7c1s3n0 01/21/2014 10:28:50 08/01/2019 20:07:33
##  8:                c2-7c1s3n0 09/19/2013 12:09:13 09/21/2013 20:01:42
##  9:                c5-3c1s5n3 05/29/2013 11:54:11 05/29/2013 11:54:11
## 10: 0323712027827 c20-4c0s1n1 01/21/2014 10:28:50 04/14/2018 19:13:27
## 11:                                           DBE 04/14/2018 19:13:27
##                SN    location              insert              remove
##  1: 0323812023604  c7-7c2s5n0 09/30/2012 12:20:00 01/25/2013 15:29:58
##  2:                c1-7c1s1n1 01/21/2014 10:28:50 10/27/2018 17:56:15
##  3:                                           DBE 10/27/2018 17:56:15
##  4:                c5-3c2s7n3 05/29/2013 11:54:11 05/29/2013 11:54:11
##  5:                c0-7c1s1n1 09/19/2013 12:09:13 09/21/2013 20:01:42
##  6: 0323812005717  c7-7c0s7n3 01/21/2014 10:28:50 08/01/2019 20:07:33
##  7:                c9-5c1s5n3 05/29/2013 11:54:11 05/29/2013 11:54:11
##  8:               c17-2c0s0n1 09/28/2012 10:29:48 02/02/2013 11:32:29
##  9:                c2-7c0s7n3 09/27/2013 16:19:07 09/30/2013 08:55:37
## 10: 0323712044649  c2-7c1s2n0 01/21/2014 10:28:50 08/01/2019 20:07:33
## 11:                c0-7c1s2n0 12/13/2013 23:55:26 12/16/2013 19:18:06
##                SN    location              insert              remove
##  1: 0323812008667  c0-3c1s5n0 10/11/2013 15:57:33 10/12/2013 22:09:31
##  2:               c11-7c2s3n3 05/29/2013 11:54:11 05/29/2013 11:54:11
##  3:               c17-2c2s0n0 09/28/2012 10:29:48 02/02/2013 11:32:29
##  4:               c13-3c1s5n0 01/21/2014 10:28:50 02/07/2017 10:20:57
##  5: 0323812006432 c16-5c0s1n3 06/25/2014 14:26:01 08/01/2019 20:07:33
##  6:                c7-7c0s7n3 05/20/2014 09:46:13 06/10/2014 10:33:15
##  7:               c14-4c1s5n0 04/02/2013 17:02:59 05/29/2013 11:54:11
##  8:                c9-5c1s5n1 01/21/2014 10:28:50 02/10/2014 11:00:31
##  9:                c2-3c0s4n1 11/13/2013 17:11:57 11/15/2013 16:10:19
## 10:               c17-7c0s0n2 09/28/2012 10:29:48 02/02/2013 11:32:29
## 11: 0320617015500 c13-4c1s6n1 04/09/2017 21:36:19 08/01/2019 20:07:33
##                SN    location              insert              remove
##  1: 0323812023669  c0-3c1s2n1 11/08/2013 15:33:11 11/11/2013 12:02:58
##  2:               c13-1c2s0n1 04/09/2013 16:16:47 05/29/2013 11:54:11
##  3:                c0-3c1s3n1 11/05/2013 17:04:09 11/05/2013 17:04:09
##  4:               c19-6c2s4n1 09/23/2012 16:21:09 02/02/2013 11:32:29
##  5:               c18-3c1s2n1 01/21/2014 10:28:50 02/07/2017 10:20:57
##  6: 0323812008531  c7-7c0s7n3 05/29/2013 11:54:11 05/29/2013 11:54:11
##  7:                c3-0c2s3n0 09/30/2012 12:20:00 01/25/2013 15:29:58
##  8:                c0-5c2s4n1 09/27/2013 16:19:07 09/30/2013 08:55:37
##  9:                c5-5c2s4n1 01/21/2014 10:28:50 11/16/2016 15:07:07
## 10:                                   Off The BUS 11/16/2016 15:07:07
## 11: 0323812026163  c3-1c0s2n3 01/21/2014 10:28:50 09/29/2014 09:08:40
## Print +/-5 rows of raw context around each occurrence of this location,
## clamping the window to [1, nrow(g_raw)] so edge matches cannot produce
## mixed-sign or out-of-range indices.
for(i in which(g_raw$location == "c22-0c2s0n0"))
  print(g_raw[max(1, i - 5):min(nrow(g_raw), i + 5), ])
##                SN    location              insert              remove
##  1:               c12-0c1s7n0 01/21/2014 10:28:50 09/02/2016 20:00:09
##  2:                                           DBE 09/02/2016 20:00:09
##  3:                c8-5c0s7n0 02/23/2013 17:11:13 05/29/2013 11:54:11
##  4:                c1-6c0s0n3 09/30/2012 12:20:00 01/25/2013 15:29:58
##  5: 0323612015890  c0-0c2s2n0 10/21/2013 14:28:19 10/28/2013 17:52:44
##  6:               c22-0c2s0n0 09/15/2012 21:53:50 01/11/2013 12:41:19
##  7:               c23-0c2s2n0 01/21/2014 10:28:50 04/27/2016 13:40:30
##  8:                                           DBE 04/27/2016 13:40:30
##  9:               c19-7c1s5n2 03/19/2013 15:48:11 05/29/2013 11:54:11
## 10:                c5-1c2s2n2 01/23/2013 00:44:44 01/25/2013 15:29:58
## 11: 0323712062514 c13-7c0s7n0 01/21/2014 10:28:50 08/01/2019 20:07:33
##                SN    location              insert              remove
##  1:               c20-7c0s3n0 09/15/2012 21:53:50 02/23/2013 17:11:13
##  2:               c21-7c2s1n1 03/19/2013 15:48:11 05/29/2013 11:54:11
##  3: 0323812057010 c10-2c0s3n0 03/12/2019 16:25:48 08/01/2019 20:07:33
##  4: 0323712022496 c14-6c1s7n2 03/22/2018 15:52:52 07/22/2018 18:10:30
##  5:                c1-1c2s3n2 01/21/2014 10:28:50 07/11/2017 18:04:25
##  6: 0323712023814 c22-0c2s0n0 01/18/2013 07:42:44 02/23/2013 17:11:13
##  7:                c2-5c1s3n2 10/31/2012 13:56:42 11/27/2012 00:11:10
##  8:               c15-2c1s6n2 01/21/2014 10:28:50 12/28/2017 09:35:23
##  9:                c5-4c1s1n2 08/14/2018 11:23:03 01/02/2019 22:20:52
## 10:                                           DBE 01/02/2019 22:20:52
## 11:               c12-2c2s2n2 09/21/2012 13:15:36 10/29/2012 12:33:02
##                SN    location              insert              remove
##  1:                c9-2c0s1n1 05/29/2013 11:54:11 05/29/2013 11:54:11
##  2:               c15-6c0s2n0 09/28/2012 10:29:48 02/02/2013 11:32:29
##  3: 0324216130635 c12-1c1s7n0 11/08/2016 18:47:06 08/01/2019 20:07:33
##  4: 0323812057031  c2-0c1s7n0 10/21/2013 14:28:19 10/28/2013 17:52:44
##  5:               c22-0c2s0n1 08/18/2015 11:09:42 04/26/2016 16:24:45
##  6:               c22-0c2s0n0 01/21/2014 10:28:50 07/28/2015 10:26:59
##  7:               c13-5c2s5n1 11/16/2012 19:46:09 02/02/2013 11:32:29
##  8:               c21-5c1s7n1 03/19/2013 15:48:11 05/29/2013 11:54:11
##  9: 0325216047499  c8-6c1s0n0 02/07/2017 10:20:57 08/01/2019 20:07:33
## 10: 0323812008661  c1-7c0s0n1 01/21/2014 10:28:50 09/01/2018 03:12:07
## 11:                c5-3c2s1n2 09/30/2012 12:20:00 01/25/2013 15:29:58
##                SN    location              insert              remove
##  1:               c19-5c2s6n0 03/19/2013 15:48:11 05/29/2013 11:54:11
##  2:               c11-7c0s0n0 04/22/2014 11:54:48 06/02/2014 14:38:24
##  3:               c23-7c1s2n0 09/23/2012 16:21:09 02/23/2013 17:11:13
##  4:               c11-7c0s6n0 07/08/2014 12:34:05 06/23/2015 09:14:10
##  5:               c11-7c0s2n0 01/21/2014 10:28:50 04/15/2014 21:24:30
##  6: 0323712028748 c22-0c2s0n0 08/26/2014 18:56:46 04/26/2016 16:24:45
##  7:                c4-2c1s2n1 12/13/2013 23:55:26 12/16/2013 19:18:06
##  8:                c0-7c2s1n1 10/21/2012 15:57:20 01/25/2013 15:29:58
##  9:                c0-3c1s4n1 05/29/2013 11:54:11 05/29/2013 11:54:11
## 10:                c4-7c0s1n0 09/16/2014 10:47:25 09/16/2014 21:35:56
## 11:                c6-2c1s2n1 01/21/2014 10:28:50 03/25/2014 17:47:33
##                SN    location              insert              remove
##  1:               c20-4c1s6n3 04/02/2013 17:02:59 05/29/2013 11:54:11
##  2:               c12-7c2s7n1 10/27/2012 16:30:22 02/23/2013 17:11:13
##  3:               c13-7c2s7n1 09/28/2012 10:29:48 10/26/2012 16:02:32
##  4:                c2-0c0s5n3 11/05/2013 17:04:09 11/11/2013 12:02:58
##  5: 0325216048007 c23-7c2s1n0 04/09/2017 21:36:19 08/01/2019 20:07:33
##  6: 0323812024316 c22-0c2s0n0 05/17/2016 12:03:10 11/16/2016 14:07:37
##  7: 0323712061484  c5-0c0s3n2 09/30/2012 12:20:00 01/25/2013 15:29:58
##  8:               c11-7c1s0n0 01/21/2014 10:28:50 11/17/2017 14:56:10
##  9:                                           DBE 11/17/2017 14:56:10
## 10:                c2-7c1s0n0 10/04/2013 15:42:29 10/04/2013 19:53:27
## 11:               c13-6c0s0n1 05/29/2013 11:54:11 05/29/2013 11:54:11
##                SN    location              insert              remove
##  1:               c15-1c1s7n2 01/21/2014 10:28:50 10/12/2016 03:54:41
##  2:                                           DBE 10/12/2016 03:54:41
##  3:               c19-4c2s7n0 03/19/2013 15:48:11 05/29/2013 11:54:11
##  4:               c24-2c1s7n3 10/24/2012 16:00:10 02/23/2013 17:11:13
##  5:               c24-3c1s1n3 10/17/2012 20:56:05 10/23/2012 16:54:52
##  6: 0323812007697 c22-0c2s0n0 08/18/2015 11:09:42 08/18/2015 14:58:25
##  7:                                           DBE 08/18/2015 13:02:49
##  8:                                           DBE 08/18/2015 14:58:25
##  9:               c22-0c2s0n1 01/21/2014 10:28:50 07/30/2015 16:58:36
## 10:                                           DBE 07/30/2015 16:58:36
## 11:               c21-5c0s1n1 03/19/2013 15:48:11 05/29/2013 11:54:11
##                SN    location              insert              remove
##  1:                c7-4c1s5n0 05/29/2013 11:54:11 05/29/2013 11:54:11
##  2: 0325016204642 c18-1c1s0n2 04/09/2017 21:36:19 08/01/2019 20:07:33
##  3: 0323712027480 c20-1c2s2n0 01/21/2014 10:28:50 05/03/2016 06:21:00
##  4:                                           DBE 05/03/2016 06:21:00
##  5:                c2-1c2s2n0 11/05/2013 17:04:09 11/11/2013 12:02:58
##  6:               c22-0c2s0n0 03/13/2013 11:42:44 05/29/2013 11:54:11
##  7:               c17-5c1s3n3 10/09/2012 16:32:54 11/04/2012 16:13:03
##  8: 0323712007868 c15-6c1s4n3 03/22/2013 16:12:17 05/29/2013 11:54:11
##  9:                c5-5c1s6n1 08/01/2017 16:32:25 09/19/2017 21:13:39
## 10:               c20-2c2s2n2 01/22/2013 12:51:29 02/23/2013 17:11:13
## 11:               c15-0c1s7n3 01/21/2014 10:28:50 02/07/2017 10:20:57
##                SN    location              insert              remove
##  1:               c23-1c0s1n3 03/13/2013 11:42:44 03/25/2013 15:28:34
##  2:               c17-7c2s0n0 01/21/2014 10:28:50 06/16/2017 20:47:22
##  3:                                           DBE 06/16/2017 20:28:27
##  4:                                   Off The BUS 06/16/2017 20:47:22
##  5:               c19-3c2s7n2 09/23/2012 16:21:09 02/02/2013 11:32:29
##  6: 0324216129884 c22-0c2s0n0 12/06/2016 13:23:18 08/01/2019 20:07:33
##  7: 0320317054765 c13-3c1s3n1 04/09/2017 21:36:19 08/01/2019 20:07:33
##  8: 0323812023855 c24-6c2s4n2 03/13/2013 11:42:44 05/29/2013 11:54:11
##  9:               c13-7c0s2n1 09/28/2012 10:29:48 02/02/2013 11:32:29
## 10:               c22-2c1s3n3 01/21/2014 10:28:50 02/07/2017 10:20:57
## 11:                c2-2c1s3n3 10/21/2013 14:28:19 10/28/2013 17:52:44

Plot overlap at locations again after removing the offending life spans.

## Drop the location-overlap records as well and confirm zero overlap
## remains.
gc_life = gc_life %>% filter(!overlap_rec)
plot.life_ev(gc_life, gc_ev_nzg, location, overlaps = TRUE, outs = FALSE, 
             file = "overlap_Loc_after")
## There are 160 units with life overlap:
## [1] "Overlap life time 0s of total life time 3181649911175s (~100820.4 years) ( 0 proportion )"

After removing overlaps, determine out = “last seen” record that may indicate censoring

## Flag each SN's latest record as "out" (last seen) unless it extends to
## the final inventory date, in which case the unit is right-censored.
## A logical comparison already yields TRUE/FALSE/NA, so no if_else needed.
gc_life = gc_life %>%
  group_by(SN) %>%
  mutate(out = (remove == max(remove)) & (remove < last_inventory)) %>%
  ungroup()

Inventory interval analysis. Use differences between sorted unique insert and remove dates of “life” records to determine inventory intervals.

## Inventory-interval analysis: the gaps between consecutive unique
## insert/remove timestamps of "life" records approximate the spacing of
## inventory runs.
life_insert = gc_life$insert # collect insert dates
life_remove = gc_life$remove # collect remove dates
dates_life = sort(unique(c(life_insert, life_remove))) # unique insert,remove

## Keep gaps of at least one day, bucket by whole days, and label each gap
## with the calendar year of its end point. NOTE(review): levels = 1:56
## assumes no gap exceeds 56 days; the plot confirms this for this data set
## but re-check if the input changes.
life_intervals =
  data.frame(days = as.numeric(diff(dates_life))/(60*60*24),
             year = factor(as.character(year(dates_life[-1])))) %>% 
  filter(days >= 1) %>% 
  mutate(days = factor(as.integer(floor(days)), levels = 1:56))
ggp(ggplot(life_intervals, aes(days, fill = year)) + stat_count() +
      scale_x_discrete(breaks = seq(1, 56, 2), drop = FALSE) + theme_bw(),
    file = "attention_intervals", height = 3)

Seems longest interval without an inventory is 56 days!

Next, sample a few SN and locations to display a sense of GPU life for the paper.

## Parameters for the SN / location sample-display figures.
nsample <- 90    # maximum count that fits a full-page pair of figures
set.seed(76513)  # fixed seed so the sampled units are reproducible
fig_height <- as.integer(nsample * 0.1 + 0.5)  # figure height scales with n

Plot a sample of 90 GPUs to check that things look right:

* starts: black dots (gray for zero lifetime)
* lifetimes: black lines
* DBEs: red triangles
* OTB: blue triangles
* Last seen: black ]

## Draw nsample serial numbers and plot their life/event timelines.
SN_sample = sample(unique(gc_ev_nzg$SN), nsample)
ev_sample = gc_ev_nzg %>% filter(SN %in% SN_sample)
life_sample = gc_life %>% filter(SN %in% SN_sample)
plot.life_ev(life_sample, ev_sample, SN, "sample_SN", overlaps = FALSE)

Plot a sample of 90 locations …

## Draw nsample locations; split the location string
## (c{col}-{row}c{cage}s{slot}n{node}) into components so the plot can be
## ordered by cabinet column.
loc_sample = sample(unique(gc_ev_nzg$location), nsample)
ev_sample = gc_ev_nzg %>% filter(location %in% loc_sample)
life_sample = gc_life %>% filter(location %in% loc_sample) %>% 
  tidyr::separate(location, c(NA, "col", "row", "cage", "slot", "node"),
           sep = "[-csn]", convert = TRUE, remove = FALSE) %>%
  mutate(location = reorder(location, col)) # orders in increasing col
plot.life_ev(life_sample, ev_sample, location, "sample_loc", overlaps = FALSE)

Reduce to only “life” records and mark them with ending DBE, OTB, “out”, or none events. The result is just insert and remove with event marks to indicate DBE or OTB or “out” (last seen). bad indicates that a DBE or OTB occurred yet the GPU was not taken out. For a first cut, keep the bad ones in. Also keep duration to later aggregate into full life times.

## Keep only the life-record fields needed for aggregation.
gc_life = gc_life %>% select(SN, location, insert, remove, duration, out) %>%
  arrange(SN, remove, location)
## Attach DBE/OTB events to life records on (SN, remove, location); a full
## join also keeps events with no matching life record (NA life fields).
gc_fulljoin = gc_life %>% full_join(gc_ev_fail, by = c("SN", "remove", "location"))
## Where one key matches several rows, blank the life fields on rows 2+ so
## each duration appears exactly once when summed per SN later. The typed NA
## casts (as.POSIXct/as.numeric/as.logical) satisfy if_else's requirement
## that both branches share a type.
gc_full = gc_fulljoin %>% group_by(SN, remove, location) %>%
  mutate(insert = if_else(row_number() > 1, as.POSIXct(NA), insert),
         duration = if_else(row_number() > 1, as.numeric(NA), as.numeric(duration)),
         out = if_else(row_number() > 1, as.logical(NA), out)) %>% ungroup()

Output gc_full data frame to a CSV file for TBF analysis with Python code

## Export for the Python TBF analysis: every column coerced to character
## (dates keep their timezone formatting), missing values written as "".
gc_write = as.data.frame(lapply(gc_full, as.character), stringsAsFactors = FALSE)
readr::write_csv(gc_write, paste0(data_dir, "gc_full.csv"), na = "", col_names = TRUE)

Message to reader of file: All fields are character strings. Dates are with timezone. Missing values as “” strings.

Print raw records for Figure 3:

## Print +/-5 rows of raw context around each occurrence of this SN,
## clamping the window to [1, nrow(g_raw)] so edge matches cannot produce
## mixed-sign or out-of-range indices (data.table errors on mixed signs).
for(i in which(g_raw$SN == "0323812007945"))
  print(g_raw[max(1, i - 5):min(nrow(g_raw), i + 5), ])
##                SN    location              insert              remove
##  1:               c11-4c0s6n2 05/29/2013 11:54:11 05/29/2013 11:54:11
##  2:                c2-4c0s7n3 10/04/2013 15:42:29 10/04/2013 19:53:27
##  3: 0320117100112 c16-2c1s6n0 04/09/2017 21:36:19 08/01/2019 20:07:33
##  4: 0324216130424  c0-1c2s6n0 11/08/2016 18:47:06 02/07/2017 10:20:57
##  5:                c1-2c2s3n0 04/09/2017 21:36:19 08/01/2019 20:07:33
##  6: 0323812007945 c17-4c1s3n1 09/28/2012 10:29:48 02/02/2013 11:32:29
##  7:               c20-6c1s5n2 07/23/2019 11:25:33 08/01/2019 20:07:33
##  8:               c13-1c1s3n3 01/21/2014 10:28:50 07/11/2017 18:04:25
##  9:                c0-1c1s3n3 10/11/2013 15:57:33 10/12/2013 22:09:31
## 10:               c21-1c2s5n0 03/19/2013 15:48:11 05/29/2013 11:54:11
## 11: 0323712024086  c0-5c1s2n3 11/13/2013 17:11:57 11/15/2013 16:10:19
## Same +/-5-row context print for the second Figure 3 SN, window clamped
## to valid row numbers.
for(i in which(g_raw$SN == "0325216047736"))
  print(g_raw[max(1, i - 5):min(nrow(g_raw), i + 5), ])
##                SN    location              insert              remove
##  1:               c18-1c1s2n1 03/25/2013 15:28:34 05/29/2013 11:54:11
##  2:                c0-3c1s3n0 11/08/2013 15:33:11 11/11/2013 12:02:58
##  3: 0320617016429  c3-3c2s2n3 04/09/2017 21:36:19 08/01/2019 20:07:33
##  4: 0320617015461  c7-5c2s4n3 07/11/2017 18:04:25 08/01/2019 20:07:33
##  5: 0324917127701 c17-1c0s4n0 03/22/2018 15:52:52 08/01/2019 20:07:33
##  6: 0325216047736 c18-4c1s5n1 04/09/2017 21:36:19 08/01/2019 20:07:33
##  7: 0323812006185 c24-3c0s2n3 01/21/2014 10:28:50 08/01/2019 20:07:33
##  8:               c23-3c0s6n1 09/23/2012 16:21:09 02/23/2013 17:11:13
##  9:               c17-7c1s4n2 05/29/2013 11:54:11 05/29/2013 11:54:11
## 10:               c17-7c2s4n2 03/22/2013 16:12:17 04/09/2013 16:16:47
## 11:                c2-3c0s3n3 10/11/2013 15:57:33 10/12/2013 22:09:31
## Same +/-5-row context print for the third Figure 3 SN, window clamped
## to valid row numbers.
for(i in which(g_raw$SN == "0323812008856"))
  print(g_raw[max(1, i - 5):min(nrow(g_raw), i + 5), ])
##                SN    location              insert              remove
##  1:                c6-1c2s4n1 02/23/2013 17:11:13 05/29/2013 11:54:11
##  2:               c10-1c0s6n0 10/03/2012 04:02:20 01/25/2013 15:29:58
##  3:               c12-3c1s0n0 09/21/2012 13:15:36 10/02/2012 20:16:53
##  4: 0324816010830  c1-0c2s7n1 04/09/2017 21:36:19 08/01/2019 20:07:33
##  5: 0321516011869 c16-0c2s7n1 07/12/2016 19:12:05 08/01/2019 20:07:33
##  6: 0323812008856  c5-4c0s7n0 09/30/2012 12:20:00 01/25/2013 15:29:58
##  7:                c0-6c1s7n2 10/21/2013 14:28:19 10/28/2013 17:52:44
##  8:                c3-3c1s5n0 05/29/2013 11:54:11 05/29/2013 11:54:11
##  9:               c23-6c1s7n2 01/21/2014 10:28:50 11/02/2018 14:42:34
## 10:                                           DBE 11/02/2018 14:42:34
## 11: 0323812025087  c5-1c0s6n3 01/21/2014 10:28:50 08/01/2019 20:07:33

Next, aggregate into one record per SN with total life time and first insert time. Indicate if event occurred or still in service: out = fail, dbe = fail, otb = fail, NA = right censored (still in service). Add some other variables, such as GPU in single location or moved one or more times, mark new_batch group, etc. to use in modeling later.

This is where life times are determined, connected with a location and with an event (OTB, DBE, out). Note that location is imprecise if more than one is used (we use longest). Get proportion of time at reference location time_max_loc.

## Aggregate to one row per SN: total life time, record/location counts, the
## reference location max_loc (the location of the longest single life
## record), per-event counts, and batch membership ("new" = first insert
## after 2016-01-01). Later summarize() expressions may reference earlier
## ones (e.g. max_loc), which dplyr evaluates sequentially.
gc_summary = gc_full %>% arrange(SN, remove) %>% group_by(SN) %>%
  summarize(
    time = sum(duration, na.rm = TRUE), # total life time
    nlife = sum(!is.na(duration)), # number of life records
    nloc = length(unique(location)), # Number of locations where lived
    last = max(remove), # last time stamp
    max_loc = ifelse(!all(is.na(insert)), location[which.max(duration)], ""), # longest-lived location ("" if no life records)
    max_loc_events = ifelse(all(is.na(event)), 0,
                            length(event[location == max_loc])), # event count at max_loc
    time_max_loc = sum(duration[location == max_loc], na.rm = TRUE)/time, # share of life at max_loc
    dbe = sum(event == "DBE", na.rm = TRUE), # double-bit-error events
    dbe_loc = ifelse(all(is.na(duration)) | dbe == 0, NA,
                     max_loc %in% location[event == "DBE"]), # did a DBE occur at max_loc?
    otb = sum(event == "OTB", na.rm = TRUE), # off-the-bus events
    otb_loc = ifelse(all(is.na(duration)) | otb == 0, NA,
                     max_loc %in% location[event == "OTB"]), # did an OTB occur at max_loc?
    out = any(out, na.rm = TRUE), # removed
    batch = if_else(min(insert, na.rm = TRUE) >
                      lubridate::mdy_hms("01/01/2016 00:00:00", tz = time_zone), "new", "old")
  )
## `summarise()` ungrouping output (override with `.groups` argument)
gc_summary
## # A tibble: 30,207 x 14
##    SN                time nlife  nloc last                max_loc     max_loc_events time_max_loc   dbe dbe_loc   otb otb_loc out   batch
##    <chr>            <dbl> <int> <int> <dttm>              <chr>                <dbl>        <dbl> <int> <lgl>   <int> <lgl>   <lgl> <chr>
##  1 0320117100060 72916274     1     1 2019-08-01 20:07:33 c11-2c1s4n1              0            1     0 NA          0 NA      FALSE new  
##  2 0320117100061 72916274     1     1 2019-08-01 20:07:33 c9-3c1s2n1               0            1     0 NA          0 NA      FALSE new  
##  3 0320117100063 72916274     1     1 2019-08-01 20:07:33 c6-2c1s2n0               0            1     0 NA          0 NA      FALSE new  
##  4 0320117100064 72916274     1     1 2019-08-01 20:07:33 c10-2c1s5n1              0            1     0 NA          0 NA      FALSE new  
##  5 0320117100065 72916274     1     1 2019-08-01 20:07:33 c3-2c1s2n0               0            1     0 NA          0 NA      FALSE new  
##  6 0320117100066 72916274     1     1 2019-08-01 20:07:33 c13-3c1s1n1              0            1     0 NA          0 NA      FALSE new  
##  7 0320117100068 72916274     1     1 2019-08-01 20:07:33 c12-2c1s3n1              0            1     0 NA          0 NA      FALSE new  
##  8 0320117100069 72916274     1     1 2019-08-01 20:07:33 c12-5c2s1n0              0            1     0 NA          0 NA      FALSE new  
##  9 0320117100070 72916274     1     1 2019-08-01 20:07:33 c14-7c2s3n1              0            1     0 NA          0 NA      FALSE new  
## 10 0320117100071 72916274     1     1 2019-08-01 20:07:33 c21-6c1s4n1              0            1     0 NA          0 NA      FALSE new  
## # … with 30,197 more rows

Separate max_loc into components (col, row, cage, slot, node) to be used as covariates in survival analysis. Note that this location is the place this SN was located the longest. This muddies the survival analysis somewhat as several locations can be the “treatment” for a given SN. But the proportions of time at max_loc tend to be very high (see later analysis below).

## Split max_loc ("c{col}-{row}c{cage}s{slot}n{node}") into positional
## covariates and derive lifetime variables and death indicators.
## The sep regex "[-csn]" also matches the leading "c", producing an empty
## first piece which the NA column name discards.
gc_summary_loc = gc_summary %>%
  tidyr::separate(max_loc, c(NA, "col", "row", "cage", "slot", "node"),
                  sep = "[-csn]", convert = TRUE, remove = TRUE) %>%
  mutate(across(c(col, row, cage, slot, node), as.factor),
         days = time / (60 * 60 * 24),  # seconds -> days
         years = days / 365,            # days -> (365-day) years
         dead = out & dbe + otb > 0,    # removed AND saw at least one DBE/OTB
         dead_otb = out & otb > 0,
         dead_dbe = out & dbe > 0)      # define fail & censored

## Batch subsets: "old" = inserted before 2016-01-01, "new" = after.
gc_summary_loc_o = gc_summary_loc %>% filter(batch == "old")
gc_summary_loc_n = gc_summary_loc %>% filter(batch == "new")

## Administratively censor old-batch lifetimes at `yrs` years: units that
## survive past the cutoff are marked censored (dead = FALSE) and their
## follow-up time is truncated to the cutoff. New-batch rows are unchanged.
## Mutate order matters: `dead` and `days` are computed from the original
## `years`, so `years` itself is overwritten last.
censor_old_at = function(data, yrs) {
  data %>%
    mutate(dead = if_else(batch == "old" & years > yrs, FALSE, dead),
           days = if_else(batch == "old" & years > yrs, yrs * 365, days),
           years = if_else(batch == "old" & years > yrs, yrs, years))
}
gc_summary_loc_censor1 = censor_old_at(gc_summary_loc, 1) # censor at 1 year
gc_summary_loc_censor2 = censor_old_at(gc_summary_loc, 2) # censor at 2 years
gc_summary_loc_censor3 = censor_old_at(gc_summary_loc, 3) # censor at 3 years
gc_summary_loc_censor4 = censor_old_at(gc_summary_loc, 4) # censor at 4 years

This is the second data set we make available. Output gc_summary_loc to a CSV file.

## Coerce every column to character so the CSV is written verbatim
## (dates, factors, and logicals keep their printed form).
gc_write = gc_summary_loc %>%
  lapply(as.character) %>%
  as.data.frame(stringsAsFactors = FALSE)
readr::write_csv(gc_write, paste0(data_dir, "gc_summary_loc.csv"), na = "",
                 col_names = TRUE)

Get some stats for sanity check.

## Sanity checks: censored + dead should equal the total row count, and
## units with a DBE/OTB should mostly (but need not all) be marked dead.
cat("Total", nrow(gc_summary_loc), "unique lifetimes\n")
## Total 30207 unique lifetimes
cat("  Censored:", sum(!gc_summary_loc$dead), "  dead:", sum(gc_summary_loc$dead),
    "  Known alive:", sum(!gc_summary_loc$out), "\n")
##   Censored: 25118   dead: 5089   Known alive: 18637
cat("  DBE or OTB during life:", sum(gc_summary_loc$dbe > 0 | gc_summary_loc$otb >0), "\n")
##   DBE or OTB during life: 5190
cat("           More than one:", sum(gc_summary_loc$dbe + gc_summary_loc$otb > 1, na.rm = TRUE), "\n")
##            More than one: 551

Nothing really remarkable in the stats. Censored + dead = total unique. Most with a DBE or OTB are marked dead. A few, 101 units (= 5190 - 5089) with DBEs or OTBs are still alive (that’s just under 2%). 551 units (about 10%) had more than one DBE or OTB.

Look at life proportion at longest location:

## Distribution of the fraction of life spent at the longest location,
## for all units and for dead units only.
ggplot(gc_summary_loc, aes(x = time_max_loc)) +
  geom_histogram(bins = 200) +
  theme_bw() +
  ggtitle("Life Proportion at Longest Location")

ggplot(gc_summary_loc %>% filter(dead), aes(x = time_max_loc)) +
  geom_histogram(bins = 200) +
  theme_bw() +
  ggtitle("Life Proportion at Longest Location - Dead Only")

The overwhelming majority of time is spent in the “longest” location, so it is reasonable to assume that the longest location is the “treatment” of a GPU. But a time-dependent analysis is possible from the current data with some additional re-coding and following this time-dependent covariates technique: https://cran.r-project.org/web/packages/survival/vignettes/timedep.pdf. We hope that others will undertake this more detailed analysis from our data.

SN categories: We tried disassembling the SN into digits but did not find a compelling relationship with lifetimes except the known new-batch/old-batch differences.

Kaplan-Meier survival analysis gives the probability of survival beyond a given time. It is a nonparametric technique that makes no specific failure model assumptions, such as Weibull, Exponential, etc. It can also split the data into subpopulations based on the covariates and compute separate survival curves.

## Kaplan-Meier survival curves by node (0-3), faceted by batch and cage.
## ggp() is a plot-and-save helper defined earlier in the file.
## NOTE(review): the legend.labs length warning emitted below looks benign
## (labels appear recycled across the 24 facet strata) -- confirm.
ggp(ggsurvplot(survfit(
  Surv(years, dead, type = 'right') ~ node, data = gc_summary_loc),
  facet.by = c("batch", "cage"), conf.int = TRUE, risk.table = TRUE,
  ggtheme = theme_bw(), xlab = "Years", legend.title = "Node", censor.size = 2,
  legend = c(0.08, 0.73), legend.labs = 0:3, ylim = c(0.08, 1), size = 0.4, 
  break.y.by = 0.1, censor = FALSE),
  width = 10.5, height = 6, file = "km_cage-node_a")
## Warning in (function (survsummary, times, survtable = c("cumevents", "risk.table", : The length of legend.labs should be 24
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

The Cox proportional hazards (CPH) regression analysis takes the hazard function above and a set of covariates \(x\) and models GPU \(k\)’s hazard as \[H_k(t) = H_0(t)\,e^{\sum\limits_{i=1}^{n}\beta_i x_{ki}}\] That is, a base hazard rate multiplied by a function of the covariates. The key assumption in the CPH model is that the “hazards” are multipliers on the “baseline hazard”, but the estimate of the baseline hazard is nonparametric, meaning it makes no specific distributional assumption and just learns from the data (unlike analyses that assume, for example, a Weibull or an exponential model). The multiplicative assumption implies that the hazard curves do not cross and are multiples of each other. The multiplier is the hazard coefficient. For example, if the baseline is node0 and the node2 hazard is 2, then node2 sees twice as many failures as node0 on average.

First, consider some diagnostics for proportionality:

## Check the proportional-hazards assumption for the old-batch model:
## cox.zph() tests for a non-zero slope in the scaled Schoenfeld
## residuals against time; ggcoxzph() plots the residuals per covariate.
fit = coxph(Surv(years, dead, type = 'right') ~ row + col + cage + slot + node,
            data = gc_summary_loc_o)
test.ph = cox.zph(fit)
ggcoxzph(test.ph)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm): collapsing to unique 'x' values

The fit lines are supposed to be horizontal for the proportionality assumption to be satisfied. They are all nearly horizontal. On the other hand, all the significance tests can detect a departure from horizontal because we have so much data. We interpret this that we have enough data to do time-dependent analysis (see comment on this above) but that the hazard proportionality is fairly good. To err on the conservative side, we use the results to make qualitative conclusions and are careful to not over-interpret the actual hazard levels.

So now the CPH model:

## Cox PH response formula (left-hand side). Note it uses `time` (seconds)
## rather than `years`: the Cox partial likelihood depends only on the
## ordering of event times, so hazard ratios are unchanged by the unit.
response = "Surv(time, event = dead) ~"
## Re-derive the batch subsets from fresh copies, since the code below
## relabels and reorders the col factor levels in place.
gc_summary_loc_o = gc_summary_loc %>% filter(batch == "old")
gc_summary_loc_n = gc_summary_loc %>% filter(batch == "new")

## Map physical cabinet column (0-24, in factor-level order) to its rank
## in torus-X order. The torus-X traversal visits column 0, then the odd
## columns 1..23 ascending, then the even columns 24..2 descending;
## order() inverts that layout into a physical-to-torus lookup vector.
torus_x_layout = c(0, seq(1, 23, by = 2), seq(24, 2, by = -2))
col2torus = order(torus_x_layout)
## Forest-plot panel layout shared by all forest_model() calls below:
## spacer, variable name, level, N, event count, vertical rule, the
## hazard-ratio forest panel, and a trailing spacer. Widths are relative.
panels = list(
  forest_panel(width = 0.03),
  forest_panel(width = 0.10, display = variable, fontface = "bold",
               heading = "Variable"),
  forest_panel(width = 0.05, display = level, fontface = "bold"),
  forest_panel(width = 0.05, display = n, heading = "N"),
  forest_panel(width = 0.05, display = n_events, heading = "Events"),
  forest_panel(width = 0.03, item = "vline", hjust = 0.5),
  forest_panel(width = 0.91, item = "forest", line_x = 0, hjust = 0.5,
               heading = "Hazard ratio", linetype = "dashed"),
  forest_panel(width = 0.03))

## Covariates for the multivariate Cox PH model (location components).
covariates = c("col", "row", "cage", "slot", "node")
## Relabel col levels as "physical (X-torus)" pairs so the forest plot
## shows both coordinates.
levels(gc_summary_loc_o$col) = 
  paste0(formatC(levels(gc_summary_loc_o$col), width = 2),
         "   (X-", formatC(col2torus, width = 2), ")")
## Old-batch fit with col levels in physical order.
mv_model = coxph(as.formula(paste(response, paste(covariates, collapse = " + "))),
                 data = gc_summary_loc_o)
ggp(forest_model(mv_model, panels, factor_separate_line = TRUE), width = 5,
    height = 7, file = "cox_o")

## Reorder col by torus X coordinate and refit. Indexing col2torus by a
## factor uses the factor's underlying integer codes, so the relabeling
## above does not affect the lookup.
tcol = col2torus[gc_summary_loc_o$col]
gc_summary_loc_o = gc_summary_loc_o %>% mutate(col = reorder(col, tcol))
mv_model = coxph(as.formula(paste(response, paste(covariates, collapse = " + "))),
                 data = gc_summary_loc_o)
ggp(forest_model(mv_model, panels, factor_separate_line = TRUE), width = 5,
    height = 7, file = "cox_o_t")

## Repeat for the new batch: physical col order, then torus order.
## NOTE(review): the "coefficient may be infinite" warnings below are
## presumably from factor levels with no (or almost no) events in the
## smaller new batch -- confirm before interpreting those coefficients.
levels(gc_summary_loc_n$col) = 
  paste0(formatC(levels(gc_summary_loc_n$col), width = 2),
         "   (X-", formatC(col2torus, width = 2), ")")
mv_model = coxph(as.formula(paste(response, paste(covariates, collapse = " + "))),
                 data = gc_summary_loc_n)
## Warning in fitter(X, Y, istrat, offset, init, control, weights = weights, : Loglik converged before variable 6,9,15 ; coefficient may be infinite.
ggp(forest_model(mv_model, panels, factor_separate_line = TRUE), width = 5,
    height = 7, file = "cox_n")

## Reorder new-batch col by torus X coordinate and refit.
tcol = col2torus[gc_summary_loc_n$col]
gc_summary_loc_n = gc_summary_loc_n %>% mutate(col = reorder(col, tcol))
mv_model = coxph(as.formula(paste(response, paste(covariates, collapse = " + "))),
                 data = gc_summary_loc_n)
## Warning in fitter(X, Y, istrat, offset, init, control, weights = weights, : Loglik converged before variable 5,8,22 ; coefficient may be infinite.
ggp(forest_model(mv_model, panels, factor_separate_line = TRUE), width = 5,
    height = 7, file = "cox_n_t")

## Report total elapsed time for the whole document run
## (Rmd.time is a proc.time() snapshot taken earlier in the file).
cat("Total time:\n"); print(proc.time() - Rmd.time)
## Total time:
##    user  system elapsed 
## 163.895   4.465 174.756